# NOTE(review): rm(list = ls()) only clears the global environment, not loaded
# packages, options, or the working directory; prefer restarting R for a truly
# clean session.
rm(list = ls())

# Loading libraries ----
library(tm)          # DocumentTermMatrix, weightTfIdf(), inspect()
library(proxy)       # dist() with extra methods (e.g. cosine)
library(RTextTools)
library(fpc)
library(wordcloud)   # wordcloud(), comparison.cloud(); attaches RColorBrewer
library(cluster)     # clusplot()  (was loaded twice; duplicate removed)
library(stringi)
library(dplyr)
library(ggplot2)
library(tidyr)
library(tidytext)    # get_sentiments(), cast_dtm()
library(data.table)
library(radarchart)  # chartJSRadar()
library(repr)
# Data Pre-processing ----
# Enron Emails: the vocabulary file holds one word per line; assign word IDs
# matching the 1-based line numbers referenced by docword.enron.txt.
enron_vocab <- read.csv("dataset/vocab.enron.txt", header = FALSE, col.names = c("word"), 
 stringsAsFactors = FALSE) %>% 
 # seq_len(n()) derives the ID range from the data instead of hard-coding the
 # vocabulary size (28102), which would error on a file of any other length.
 mutate(wordID = seq_len(n()))

# Bag-of-words triples (docID, wordID, count); the first 3 lines are header
# counts (n docs, n words, n non-zero entries), hence skip = 3.
enron_words <- read.csv("dataset/docword.enron.txt", header = FALSE, sep = " ", 
 col.names = c("docID", "wordID", "count"), skip = 3)

# Attach the actual word strings to the (doc, word, count) triples.
enron_words <- merge(enron_words, enron_vocab, by = "wordID") %>% 
 select(docID, word, count)
#NIPS data
# Same pipeline as Enron: vocabulary line numbers become word IDs.
nips_vocab <- read.csv("dataset/vocab.nips.txt", header = FALSE, col.names = c("word"), 
 stringsAsFactors = FALSE) %>% 
 # derive IDs from the data; the hard-coded 1:12419 would break on a
 # vocabulary file of any other length
 mutate(wordID = seq_len(n()))

# (docID, wordID, count) triples; 3 header lines skipped.
nips_words <- read.csv("dataset/docword.nips.txt", header = FALSE, sep = " ", 
 col.names = c("docID", "wordID", "count"), skip = 3)

# Replace wordID with the word string.
nips_words <- merge(nips_words, nips_vocab, by = "wordID") %>% 
 select(docID, word, count)
#KOS data
# Same pipeline as Enron/NIPS: vocabulary line numbers become word IDs.
kos_vocab <- read.csv("dataset/vocab.kos.txt", header = FALSE, col.names = c("word"), 
 stringsAsFactors = FALSE) %>% 
 # derive IDs from the data; the hard-coded 1:6906 would break on a
 # vocabulary file of any other length
 mutate(wordID = seq_len(n()))

# (docID, wordID, count) triples; 3 header lines skipped.
kos_words <- read.csv("dataset/docword.kos.txt", header = FALSE, sep = " ", 
 col.names = c("docID", "wordID", "count"), skip = 3)

# Replace wordID with the word string.
kos_words <- merge(kos_words, kos_vocab, by = "wordID") %>% 
 select(docID, word, count)
#Exploration
# Corpus-wide total count for every unique word.
enron_unique_words <- aggregate(enron_words['count'], by = enron_words['word'], sum)
nips_unique_words <- aggregate(nips_words['count'], by = nips_words['word'], sum)
kos_unique_words <- aggregate(kos_words['count'], by = kos_words['word'], sum)

# Top 10 words per corpus; decreasing = TRUE spelled out (T is reassignable).
enron_top <- enron_unique_words[order(enron_unique_words$count, decreasing = TRUE)[1:10], ]
nips_top <- nips_unique_words[order(nips_unique_words$count, decreasing = TRUE)[1:10], ]
kos_top <- kos_unique_words[order(kos_unique_words$count, decreasing = TRUE)[1:10], ]

# Bare column names inside aes() so ggplot evaluates them in the data context
# (df$col inside aes() is an anti-pattern); titles identify the corpus.
g <- ggplot(data = enron_top, aes(x = word, y = count))
g + geom_bar(stat = "identity") + xlab("Words") + ylab("Count") + ggtitle("Top 10 words - Enron")

g <- ggplot(data = nips_top, aes(x = word, y = count))
g + geom_bar(stat = "identity") + xlab("Words") + ylab("Count") + ggtitle("Top 10 words - NIPS")

g <- ggplot(data = kos_top, aes(x = word, y = count))
g + geom_bar(stat = "identity") + xlab("Words") + ylab("Count") + ggtitle("Top 10 words - KOS")

# Sentiment Analysis
# NOTE(review): older tidytext returns the AFINN numeric column as "score";
# versions >= 0.2.1 rename it to "value" — confirm against the installed
# tidytext before running.
afinn <- get_sentiments("afinn")
nrc <- get_sentiments("nrc")
bing <- get_sentiments("bing")

# enron sentiment based on AFINN lexicon
enron_afinn <- enron_words %>% 
 # Inner join to AFINN lexicon on the shared word column
 inner_join(afinn, by = "word") %>%
 # Count by score and document ID, weighted by per-document word frequency;
 # without wt every word contributed 1 regardless of how often it occurred
 count(score, docID, wt = count)

enron_afinn_agg <- enron_afinn %>% 
 # Group by document
 group_by(docID) %>%
 # Total document sentiment = score times number of occurrences; the
 # original sum(score) ignored the frequency column n entirely
 summarize(total_score = sum(score * n))

# Smoothed sentiment trend across document IDs
ggplot(enron_afinn_agg, aes(docID, total_score)) +
 geom_smooth()

# Enron sentiment via the NRC emotion lexicon: label each word with its NRC
# sentiment, then total the word counts per sentiment category.
enron_nrc <- enron_words %>% 
 # attach NRC sentiment labels (join key: the word itself)
 inner_join(nrc, by = "word") %>% 
 # one row per sentiment category
 group_by(sentiment) %>% 
 # total occurrences of all words carrying that sentiment
 summarize(total_count = sum(count))

# Bar chart: total word count per NRC sentiment category
ggplot(enron_nrc, aes(x = sentiment, y = total_count)) +
 geom_col()

# enron sentiment by bing lexicon
enron_bing <- enron_words %>%
 # join each word to its bing positive/negative label
 inner_join(bing, by = "word") %>%
 # occurrences per (sentiment, document), weighted by word frequency
 count(sentiment, docID, wt = count) %>%
 # one row per document with `positive` / `negative` columns
 # (pivot_wider supersedes spread)
 pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
 # net polarity per document; the original also re-assigned
 # docID = unique(docID), a fragile no-op after widening (docID is
 # already unique there), which has been dropped
 mutate(polarity = positive - negative)

# plotting the sentiment trend over document IDs
ggplot(enron_bing, aes(docID, polarity)) + 
 geom_smooth() +
 geom_hline(yintercept = 0, color = "red") +
 ggtitle("Enron Emails Chronological Polarity")

# enron frequency analysis
enron_sents <- inner_join(enron_words, bing, by = "word")

# per-word occurrence totals split by sentiment (wt = count already weights
# by frequency here), widened to positive/negative columns
enron_tidy_sentiment <- enron_sents %>% 
 count(word, sentiment, wt = count) %>%
 # pivot_wider supersedes spread()
 pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
 mutate(polarity = positive - negative)

# subsetting the data for words with high absolute polarity
enron_tidy_small <- enron_tidy_sentiment %>% 
 filter(abs(polarity) >= 1000)

# sign label used for the bar fill colour
enron_tidy_pol <- enron_tidy_small %>% 
 mutate(
 pol = ifelse(polarity > 0, "positive", "negative")
 )

# plotting the word frequency, bars ordered by polarity
ggplot(
 enron_tidy_pol, 
 aes(reorder(word, polarity), polarity, fill = pol)) +
 geom_bar(stat = "identity") + 
 ggtitle("Enron Emails: Sentiment Word Frequency") + 
 theme(axis.text.x = element_text(angle = 90, vjust = -0.1))

# enron emotional introspection
enron_sentiment <- inner_join(enron_words, nrc, by = "word")

# drop the plain positive/negative labels, keeping the 8 NRC emotions
enron_pos_neg <- enron_sentiment %>%
 filter(!grepl("positive|negative", sentiment))

# rows = terms, columns = emotions, cells = total occurrences (wt = count
# weights by word frequency; pivot_wider supersedes spread)
enron_tidy <- enron_pos_neg %>% 
 count(sentiment, term = word, wt = count) %>% 
 pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
 as.data.frame()

# comparison.cloud() wants terms as row names and emotions as columns,
# so promote the first (term) column to row names and drop it
rownames(enron_tidy) <- enron_tidy[, 1]
enron_tidy[, 1] <- NULL

# comparison cloud of the most distinctive terms per emotion
comparison.cloud(enron_tidy, max.words = 200, title.size = 1.5)

# enron radarchart: corpus-level profile of the 8 NRC emotions
enron_sentiment <- inner_join(enron_words, nrc)

# keep only the emotion categories, not plain positive/negative
enron_pos_neg <- enron_sentiment %>%
 filter(!grepl("positive|negative", sentiment))

# row count per emotion; count(sentiment) == group_by(sentiment) + tally()
enron_tally <- enron_pos_neg %>%
 count(sentiment)

# interactive JavaScript radar chart of the emotion profile
chartJSRadar(enron_tally)
#Document Term Matrix
# Cast the tidy (docID, word, count) triples into tm DocumentTermMatrix
# objects; plain calls instead of single-step pipes.
enron_dtm <- cast_dtm(enron_words, docID, word, count)
nips_dtm <- cast_dtm(nips_words, docID, word, count)
kos_dtm <- cast_dtm(kos_words, docID, word, count)

# Summary of the Enron DTM (dimensions, sparsity, sample terms);
# captured console output follows below.
inspect(enron_dtm)
## <<DocumentTermMatrix (documents: 39861, terms: 28099)>>
## Non-/sparse entries: 3710420/1116343819
## Sparsity           : 100%
## Maximal term length: 49
## Weighting          : term frequency (tf)
## Sample             :
##        Terms
## Docs    business california company customer electricity energy market
##   23414       72          3      49       33           0      1     20
##   25892        5          1      12        1           0      2      7
##   28717        3         43      14       16          35     45     11
##   31297       25         44      10       17          25     51     13
##   33802       19         80      26       40          49     25     17
##   33940        5          1      12        1           0      2      7
##   33959       55         11      80       35           0      1     25
##   33971       72          3      49       33           0      1     20
##   34007       25         44      10       17          25     51     13
##   34027        3         43      14       16          35     45     11
##        Terms
## Docs    meeting plan power
##   23414       1   20     0
##   25892      13   13     6
##   28717       7   24    59
##   31297       3   38    55
##   33802       4   22   116
##   33940      13   13     6
##   33959       6   20     1
##   33971       1   20     0
##   34007       3   38    55
##   34027       7   24    59
# Summary of the NIPS DTM (dimensions, sparsity, sample terms);
# captured console output follows below.
inspect(nips_dtm)
## <<DocumentTermMatrix (documents: 1500, terms: 12375)>>
## Non-/sparse entries: 746316/17816184
## Sparsity           : 96%
## Maximal term length: 23
## Weighting          : term frequency (tf)
## Sample             :
##       Terms
## Docs   algorithm data function input learning model network neural set
##   1090         0    2        3    22        4    28       8      9   6
##   1461         2   29        3     1       22    36       1      0   4
##   1471         5   10        7    15       34     6      17      7  33
##   178          0    0       18     3       13     6      70     38   4
##   4           17   28       35    21       26    11      16     10  10
##   512          5    7        2    16        4    33       7      1  16
##   56           1    5       15     6        4     5      32     21  21
##   682          5    1        4    30       18     3      58      9   5
##   746          1    1        1    49        3     4      37      9   8
##   919          0    2        2    24       26    17      42     21   2
##       Terms
## Docs   system
##   1090     13
##   1461     15
##   1471      8
##   178       6
##   4        21
##   512      48
##   56       22
##   682       2
##   746       5
##   919      15
# Summary of the KOS DTM (dimensions, sparsity, sample terms);
# captured console output follows below.
inspect(kos_dtm)
## <<DocumentTermMatrix (documents: 3430, terms: 6906)>>
## Non-/sparse entries: 353160/23334420
## Sparsity           : 99%
## Maximal term length: 40
## Weighting          : term frequency (tf)
## Sample             :
##       Terms
## Docs   bush campaign democratic general house iraq kerry november poll war
##   1037    1        0          1      18     2    4     2       11    2   2
##   1197   25        7          3       3     1    3     8        2    1   0
##   1304    3        0          2       6     3   10     2       11    2   4
##   1739   12        2          4       0     1    1     1        1    1   0
##   1832   17        0          0       0     7   29     0        0    0   2
##   1955    2        6          3       1     2    0     4       10    5   1
##   232     3        4          1       0     5    1     1        0   11   1
##   2355    1        6         14       1     1    0     0        1    0   0
##   3411    5        4          3       1     2    0     5       10    4   1
##   480     1        0          1       2     1    2     6        0    0   2
#TF-IDF Weighting Scheme
# Re-weight raw term frequencies by inverse document frequency;
# normalize = TRUE divides by document length so long documents
# do not dominate the weights.
enron_dtm_tfidf <- weightTfIdf(enron_dtm, normalize = TRUE)
nips_dtm_tfidf <- weightTfIdf(nips_dtm, normalize = TRUE)
kos_dtm_tfidf <- weightTfIdf(kos_dtm, normalize = TRUE)
# Summary of the TF-IDF weighted Enron DTM; captured output follows below.
inspect(enron_dtm_tfidf)
## <<DocumentTermMatrix (documents: 39861, terms: 28099)>>
## Non-/sparse entries: 3710420/1116343819
## Sparsity           : 100%
## Maximal term length: 49
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
## Sample             :
##        Terms
## Docs      attached date     market meeting number office     report
##   1487  0.01350825    0 0.00000000       0      0      0 0.07622168
##   19095 0.00000000    0 0.00000000       0      0      0 0.00000000
##   19314 0.00000000    0 0.00000000       0      0      0 0.00000000
##   19374 0.00000000    0 0.00000000       0      0      0 0.00000000
##   21785 0.00000000    0 0.00351855       0      0      0 0.00000000
##   21922 0.00000000    0 0.00000000       0      0      0 0.00000000
##   27395 0.00000000    0 0.00000000       0      0      0 0.00000000
##   5686  0.00000000    0 0.00000000       0      0      0 0.00000000
##   5750  0.00000000    0 0.00000000       0      0      0 0.00000000
##   6492  0.00000000    0 0.00000000       0      0      0 0.00000000
##        Terms
## Docs    request       team      think
##   1487        0 0.00000000 0.02748471
##   19095       0 0.00000000 0.00000000
##   19314       0 0.00000000 0.00000000
##   19374       0 0.00000000 0.00000000
##   21785       0 0.00000000 0.00000000
##   21922       0 0.00000000 0.00000000
##   27395       0 0.00000000 0.00000000
##   5686        0 0.00000000 0.00000000
##   5750        0 0.01331161 0.00000000
##   6492        0 0.00000000 0.00000000
# Summary of the TF-IDF weighted NIPS DTM; captured output follows below.
inspect(nips_dtm_tfidf)
## <<DocumentTermMatrix (documents: 1500, terms: 12375)>>
## Non-/sparse entries: 746316/17816184
## Sparsity           : 96%
## Maximal term length: 23
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
## Sample             :
##       Terms
## Docs   cell circuit image    learning     neuron object speech training
##   1088    0       0     0 0.000000000 0.00000000      0      0        0
##   1227    0       0     0 0.000000000 0.00000000      0      0        0
##   1365    0       0     0 0.000000000 0.00000000      0      0        0
##   1500    0       0     0 0.000000000 0.00000000      0      0        0
##   553     0       0     0 0.000000000 0.00000000      0      0        0
##   697     0       0     0 0.000000000 0.00000000      0      0        0
##   819     0       0     0 0.000000000 0.00000000      0      0        0
##   83      0       0     0 0.005527718 0.04293003      0      0        0
##   86      0       0     0 0.000000000 0.00000000      0      0        0
##   954     0       0     0 0.000000000 0.00000000      0      0        0
##       Terms
## Docs   unit visual
##   1088    0      0
##   1227    0      0
##   1365    0      0
##   1500    0      0
##   553     0      0
##   697     0      0
##   819     0      0
##   83      0      0
##   86      0      0
##   954     0      0
# Summary of the TF-IDF weighted KOS DTM; captured output follows below.
inspect(kos_dtm_tfidf)
## <<DocumentTermMatrix (documents: 3430, terms: 6906)>>
## Non-/sparse entries: 353160/23334420
## Sparsity           : 99%
## Maximal term length: 40
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
## Sample             :
##       Terms
## Docs         bush clark dean    edwards iraq kerry november percent poll
##   1639 0.00000000     0    0 0.00000000    0     0        0       0    0
##   1685 0.00000000     0    0 0.00000000    0     0        0       0    0
##   195  0.00000000     0    0 0.03764289    0     0        0       0    0
##   196  0.00000000     0    0 0.06984344    0     0        0       0    0
##   2098 0.00000000     0    0 0.00000000    0     0        0       0    0
##   2649 0.00000000     0    0 0.00000000    0     0        0       0    0
##   2757 0.00000000     0    0 0.00000000    0     0        0       0    0
##   328  0.00607109     0    0 0.00000000    0     0        0       0    0
##   382  0.00000000     0    0 0.00000000    0     0        0       0    0
##   838  0.00000000     0    0 0.00000000    0     0        0       0    0
##       Terms
## Docs        senate
##   1639 0.000000000
##   1685 0.011717515
##   195  0.000000000
##   196  0.000000000
##   2098 0.000000000
##   2649 0.000000000
##   2757 0.000000000
##   328  0.000000000
##   382  0.000000000
##   838  0.008218167
# Word clouds from TF-IDF weights ----
# Only document 3's row is ever plotted, so densify just that single row
# instead of materializing 1000 documents (the Enron slice alone would be a
# 1000 x ~28k dense matrix, ~225 MB). Distinct names per corpus also avoid
# the order-dependent reuse of a single `dtm.matrix` variable.
enron_row <- as.matrix(enron_dtm_tfidf[3, ])
set.seed(142)  # fixed seed -> reproducible word placement
wordcloud(colnames(enron_row), enron_row[1, ], max.words = 200, rot.per = 0.2, colors = brewer.pal(6, "Dark2"))

nips_row <- as.matrix(nips_dtm_tfidf[3, ])
set.seed(142)
wordcloud(colnames(nips_row), nips_row[1, ], max.words = 200, rot.per = 0.2, colors = brewer.pal(6, "Dark2"))

kos_row <- as.matrix(kos_dtm_tfidf[3, ])
set.seed(142)
wordcloud(colnames(kos_row), kos_row[1, ], max.words = 200, rot.per = 0.2, colors = brewer.pal(6, "Dark2"))

# Distance Matrix using Cosine Measure
# Namespace dist() explicitly: base stats::dist() has no "cosine" method; the
# cosine measure is registered by the proxy package, and relying on proxy
# masking stats::dist is fragile if load order ever changes.
em  <- as.matrix(enron_dtm_tfidf[1:1000, ])
enron_distMatrix <- proxy::dist(em, method = "cosine")
nm  <- as.matrix(nips_dtm_tfidf[1:20, ])
nips_distMatrix <- proxy::dist(nm, method = "cosine")
km  <- as.matrix(kos_dtm_tfidf[1:20, ])
kos_distMatrix <- proxy::dist(km, method = "cosine")
# Smaller 20-document Enron slice so the dendrogram/cluster plots stay legible
m2  <- as.matrix(enron_dtm_tfidf[1:20, ])
distMatrix2 <- proxy::dist(m2, method = "cosine")
#Clustering
# Hierarchical Clustering: Ward linkage on the cosine distance matrices,
# with a rectangle drawn around a k-cluster cut of each dendrogram.
enron_hc <- hclust(distMatrix2, method = "ward.D")
plot(enron_hc, cex = 0.9, hang = -1)  # hang = -1 aligns leaf labels at the baseline
rect.hclust(enron_hc, k = 7)

nips_hc <- hclust(nips_distMatrix, method = "ward.D")
plot(nips_hc, cex = 0.9, hang = -1)
rect.hclust(nips_hc, k = 3)

kos_hc <- hclust(kos_distMatrix, method = "ward.D")
plot(kos_hc, cex = 0.9, hang = -1)
rect.hclust(kos_hc, k = 4)

#K Means Clustering algorithm
# K-means on each distance matrix; clusplot() projects the resulting cluster
# assignments onto the first two principal components for visualization.
enron_km <- kmeans(distMatrix2, 7, nstart = 100)
clusplot(as.matrix(distMatrix2), enron_km$cluster, color = TRUE, shade = TRUE, labels = 2, lines = 0)

nips_km <- kmeans(nips_distMatrix, 3, nstart = 100)
clusplot(as.matrix(nips_distMatrix), nips_km$cluster, color = TRUE, shade = TRUE, labels = 2, lines = 0)

kos_km <- kmeans(kos_distMatrix, 4, nstart = 100)
clusplot(as.matrix(kos_distMatrix), kos_km$cluster, color = TRUE, shade = TRUE, labels = 2, lines = 0)

#Kmeans 
# Look for the "elbow" in total within-cluster sum of squares (withinss) as a
# function of k. The original seeded wss with 2:30 — a length-29 vector of
# *values* 2..30 — then wrote into wss[2]..wss[30], leaving a stale value in
# wss[1] and silently growing the vector; preallocate properly instead.
max_k <- 30
wss <- numeric(max_k)
for (k in 2:max_k) {
  # tot.withinss == sum(withinss); the explicit sum is kept for clarity
  wss[k] <- sum(kmeans(enron_distMatrix, centers = k, nstart = 25)$withinss)
}
plot(2:max_k, wss[2:max_k], type = "b",
     xlab = "Number of Clusters", ylab = "Within groups sum of squares")